The most recent update of this html document occurred: Wed Nov 30 11:01:04 2016

> library(knitr)
> library(ggplot2)
> library(reshape)
> library(DESeq2)
> library(CHBUtils)
> library(limma)
> library(gtools)
> library(gridExtra)
> library(devtools)
> library(dplyr)
> library(tidyr)
> library(pheatmap)
> library(rio)
> exp1 = import("Experiment1_Data.xlsx")
> exp1$Gene_Name = gsub("\\'", "", exp1$Gene_Name)
> colnames(exp1) = gsub("\\+", "p", colnames(exp1))
> exp2 = import("Experiment2_Data.xlsx")
> exp2$Gene_Name = gsub("\\'", "", exp2$Gene_Name)
> colnames(exp2) = gsub("\\+", "p", colnames(exp2))
> 
> data = inner_join(exp1[, 1:11] %>% filter(Gene_Name != ""), exp2[, 1:11] %>% 
+     filter(Gene_Name != ""), by = "Gene_Name") %>% select(Gene_Name, Entrez_Gene_ID = Entrez_Gene_ID.x, 
+     Description = Description.x, d1_nt = nt.x, d1_sh17 = sh17.x, d1_sh16 = sh16.x, 
+     d1_ntp = ntp.x, d1_sh17p = sh17p.x, d1_sh16p = sh16p.x, d2_nt = nt.y, d2_sh17 = sh17.y, 
+     d2_sh16 = sh16.y, d2_ntp = ntp.y, d2_sh17p = sh17p.y, d2_sh16p = sh16p.y)
> 
> counts = data[, 4:15]
> row.names(counts) = data$Gene_Name
> 
> 
> metadata = rbind(data.frame(donor = "donor1", group = colnames(exp1)[3:8]) %>% 
+     mutate(name = paste0("d1_", group)), data.frame(donor = "donor2", group = colnames(exp2)[3:8]) %>% 
+     mutate(name = paste0("d2_", group))) %>% mutate(treat = ifelse(grepl("p$", 
+     group), "plus", "minus")) %>% mutate(type = gsub("p$", "", group))
> rownames(metadata) = metadata$name
> metadata = metadata[colnames(counts), ]
> metadata
          donor group     name treat type
d1_nt    donor1    nt    d1_nt minus   nt
d1_sh17  donor1  sh17  d1_sh17 minus sh17
d1_sh16  donor1  sh16  d1_sh16 minus sh16
d1_ntp   donor1   ntp   d1_ntp  plus   nt
d1_sh17p donor1 sh17p d1_sh17p  plus sh17
d1_sh16p donor1 sh16p d1_sh16p  plus sh16
d2_nt    donor2    nt    d2_nt minus   nt
d2_sh17  donor2  sh17  d2_sh17 minus sh17
d2_sh16  donor2  sh16  d2_sh16 minus sh16
d2_ntp   donor2   ntp   d2_ntp  plus   nt
d2_sh17p donor2 sh17p d2_sh17p  plus sh17
d2_sh16p donor2 sh16p d2_sh16p  plus sh16

Distribution of expression values

> counts = counts[rowSums(counts[, 1:6] > 0) > 2 & rowSums(counts[, 6:12] > 0) > 
+     2, ]
> 
> log2counts = log2(counts + 1)
> 
> ggplot(melt(log2counts), aes(x = value, color = variable)) + geom_density() + 
+     theme_bw()

> mds(log2counts, k = 2, condition = metadata$donor) + theme_bw()

## Differential analysis with DESeq2

Just using DESeq2 without any further addition to the analysis

Dispersion values

> dds = DESeqDataSetFromMatrix(counts, metadata, design = ~donor + treat + type)
> # rdds = rlog(dds)
> dds = DESeq(dds)
> DESeq2::plotDispEsts(dds)

> sf <- sizeFactors(dds)
> disp <- dispersions(dds)
> dds_group = DESeqDataSetFromMatrix(counts, metadata, design = ~donor + group)
> sizeFactors(dds_group) <- sf
> dispersions(dds_group) <- disp
> dds_group <- nbinomWaldTest(dds_group)

How the gene SET looks like:

> res_sh16_vs_control = results(dds_group, list("groupsh16", "groupnt"))
> suppressWarnings(DEGreport::degPlot(dds_group, res_sh16_vs_control["SET", , 
+     drop = FALSE], n = 1, xs = "treat", group = "type", batch = "donor"))

WT treatment effect

  • nt vs nt+
> res_nt_vs_plus = results(dds_group, list("groupntp", "groupnt"))
> DESeq2::plotMA(res_nt_vs_plus)

> res = res_nt_vs_plus %>% as.data.frame() %>% tibble::rownames_to_column("id") %>% 
+     arrange(padj) %>% tibble::column_to_rownames("id")
> 
> res %>% head(15) %>% knitr::kable()
baseMean log2FoldChange lfcSE stat pvalue padj
CYBB 2900830.1 1.2927340 0.1527038 8.465631 0e+00 0.00e+00
AKR1B1 11388704.5 -0.8760231 0.1266923 -6.914573 0e+00 0.00e+00
DYSF 20773280.4 1.3837569 0.2035469 6.798222 0e+00 0.00e+00
ACSL1 16567561.0 1.0930494 0.1643207 6.651929 0e+00 1.00e-07
CYBA 1776465.4 1.1292498 0.1735012 6.508601 0e+00 1.00e-07
AKAP13 9230549.6 0.4631490 0.0736832 6.285681 0e+00 4.00e-07
DHRS9 825946.3 1.0590918 0.1797898 5.890722 0e+00 3.60e-06
RAB43 1060144.4 0.6151003 0.1040928 5.909156 0e+00 3.60e-06
MLKL 1774305.4 0.6338305 0.1080660 5.865217 0e+00 3.80e-06
GRAMD1A 249876.0 0.7678239 0.1340367 5.728459 0e+00 7.70e-06
CEP170 7763560.4 0.3140712 0.0552059 5.689086 0e+00 8.80e-06
UTRN 17584830.4 0.7712507 0.1393132 5.536092 0e+00 1.95e-05
MMP8 374974.2 0.9100422 0.1689469 5.386556 1e-07 4.18e-05
CHMP2A 4830981.4 0.4540989 0.0846320 5.365572 1e-07 4.36e-05
FGR 2195862.8 0.9091360 0.1704786 5.332846 1e-07 4.87e-05
> volcano_density_plot(res[, c("log2FoldChange", "pvalue")], shade.colour = "orange")

> suppressWarnings(DEGreport::degPlot(dds_group, res, n = 9, xs = "treat", group = "type", 
+     batch = "donor"))

> deseq2_ntplus_effect = res

sh16 treatment effect

  • sh16 vs sh16+
> res_sh16_vs_plus = results(dds_group, list("groupsh16p", "groupsh16"))
> DESeq2::plotMA(res_sh16_vs_plus)

> res = res_sh16_vs_plus %>% as.data.frame() %>% tibble::rownames_to_column("id") %>% 
+     arrange(padj) %>% tibble::column_to_rownames("id")
> 
> res %>% head(15) %>% knitr::kable()
baseMean log2FoldChange lfcSE stat pvalue padj
ACSL1 16567561.0 1.2779508 0.1643207 7.777172 0e+00 0.0000000
CYBB 2900830.1 1.1087726 0.1527041 7.260924 0e+00 0.0000000
DHRS9 825946.3 1.1875589 0.1797906 6.605232 0e+00 0.0000001
DYSF 20773280.4 1.3021073 0.2035469 6.397086 0e+00 0.0000003
UTRN 17584830.4 0.8748320 0.1393133 6.279600 0e+00 0.0000005
EFHD2 19151142.9 0.4732838 0.0791501 5.979574 0e+00 0.0000028
DOK3 4842773.4 0.5519350 0.0941608 5.861625 0e+00 0.0000050
AKAP13 9230549.6 0.4194689 0.0736838 5.692827 0e+00 0.0000105
RBPJ 1246748.9 0.5915706 0.1039267 5.692191 0e+00 0.0000105
SLK 14105526.4 0.3047064 0.0550687 5.533204 0e+00 0.0000238
GRAMD1A 249876.0 0.7319624 0.1340475 5.460473 0e+00 0.0000299
HIP1 12592345.6 0.7236909 0.1324214 5.465058 0e+00 0.0000299
HCK 6270652.0 0.7460440 0.1423116 5.242327 2e-07 0.0000922
AKR1B1 11388704.5 -0.6597371 0.1266926 -5.207387 2e-07 0.0001034
SMPDL3A 349707.7 1.0411864 0.2029895 5.129263 3e-07 0.0001466
> volcano_density_plot(res[, c("log2FoldChange", "pvalue")], shade.colour = "orange")

> suppressWarnings(DEGreport::degPlot(dds_group, res, n = 9, xs = "treat", group = "type", 
+     batch = "donor"))

> deseq2_sh16plus_effect = res

sh17 treatment effect

  • sh17 vs sh17+
> res_sh17_vs_plus = results(dds_group, list("groupsh17p", "groupsh17"))
> DESeq2::plotMA(res_sh17_vs_plus)

> res = res_sh17_vs_plus %>% as.data.frame() %>% tibble::rownames_to_column("id") %>% 
+     arrange(padj) %>% tibble::column_to_rownames("id")
> 
> res %>% head(15) %>% knitr::kable()
baseMean log2FoldChange lfcSE stat pvalue padj
CYBB 2900830.1 1.1552149 0.1527042 7.565051 0.00e+00 0.0000000
ACSL1 16567561.0 0.9316196 0.1643207 5.669519 0.00e+00 0.0000229
DYSF 20773280.4 1.1549701 0.2035469 5.674221 0.00e+00 0.0000229
EFHD2 19151142.9 0.4533775 0.0791500 5.728076 0.00e+00 0.0000229
HEMGN 266539.7 -0.7048192 0.1248420 -5.645692 0.00e+00 0.0000229
AKR1B1 11388704.5 -0.6950905 0.1266924 -5.486443 0.00e+00 0.0000477
MMP8 374974.2 0.9127065 0.1689500 5.402227 1.00e-07 0.0000656
UTRN 17584830.4 0.7242910 0.1393133 5.199009 2.00e-07 0.0001746
HCK 6270652.0 0.7336007 0.1423116 5.154891 3.00e-07 0.0001966
CYBA 1776465.4 0.8769022 0.1735016 5.054146 4.00e-07 0.0003015
SEMA6B 302314.0 0.5819162 0.1243450 4.679853 2.90e-06 0.0018198
CD109 1324552.9 -0.6074466 0.1333961 -4.553707 5.30e-06 0.0028272
TYROBP 2200631.4 0.5676309 0.1244887 4.559696 5.10e-06 0.0028272
CEP170 7763560.4 0.2428626 0.0552063 4.399184 1.09e-05 0.0054120
DHRS9 825946.3 0.7827607 0.1797907 4.353733 1.34e-05 0.0054898
> volcano_density_plot(res[, c("log2FoldChange", "pvalue")], shade.colour = "orange")

> suppressWarnings(DEGreport::degPlot(dds_group, res, n = 9, xs = "treat", group = "type", 
+     batch = "donor"))

> deseq2_sh17plus_effect = res

Proteins induced by SET in WT but not in SET-KD

The idea is to use the DE genes from nt vs nt+ and remove the ones that changed due to sh6 vs sh16+ or sh17 vs sh17+.

We used the FDR < 0.05 to get the DE and pvalue>0.2 to get the non-changed genes.

> deseq2_sh_dependent = inner_join(deseq2_ntplus_effect %>% tibble::rownames_to_column("id") %>% 
+     filter(padj < 0.05) %>% select(id, nt_padj = padj), inner_join(deseq2_sh16plus_effect %>% 
+     tibble::rownames_to_column("id") %>% filter(pvalue > 0.2) %>% select(id, 
+     sh16_padj = pvalue), deseq2_sh17plus_effect %>% tibble::rownames_to_column("id") %>% 
+     filter(pvalue > 0.2) %>% select(id, sh17_padj = pvalue), by = "id"), by = "id") %>% 
+     rowwise() %>% mutate(sh_max = max(sh16_padj, sh17_padj)) %>% mutate(diff = nt_padj - 
+     sh_max) %>% arrange((nt_padj))
> 
> suppressWarnings(DEGreport::degPlot(dds_group, deseq2_ntplus_effect[deseq2_sh_dependent$id, 
+     ], n = 9, xs = "treat", group = "type", batch = "donor"))

Differential analysis with lima-voom as microarray data

> d = model.matrix(~0 + donor + treat, metadata)
> y = normalizeBetweenArrays(log2(counts), method = "quantile")
> 
> limma_ma = voomaByGroup(y, group = metadata$type, d, plot = FALSE)
> 
> limmaPlot = function(counts, genes, metadata, xs = "time", group = "condition", 
+     batch = NULL) {
+     pp = lapply(genes, function(gene) {
+         dd = data.frame(count = counts[gene, ], time = metadata[, xs])
+         if (is.null(group)) {
+             dd$treatment = "one_group"
+         } else {
+             dd$treatment = metadata[row.names(dd), group]
+         }
+         if (!is.null(batch)) {
+             dd$batch = metadata[row.names(dd), batch]
+             p = ggplot(dd, aes(x = time, y = count, color = batch, shape = treatment))
+         } else {
+             p = ggplot(dd, aes(x = time, y = count, color = treatment, shape = treatment))
+         }
+         p = p + stat_smooth(aes(x = time, y = count, group = treatment, color = treatment), 
+             fill = "grey80") + geom_jitter(size = 1, alpha = 0.7, height = 0, 
+             width = 0.2) + theme_bw(base_size = 7) + ggtitle(gene)
+         if (length(unique(dd$treatment)) == 1) {
+             p = p + scale_color_brewer(guide = FALSE, palette = "Set1") + scale_fill_brewer(guide = FALSE, 
+                 palette = "Set1")
+         }
+         p
+     })
+     n = ceiling(length(pp))
+     do.call(grid.arrange, pp)
+ }

WT treatment effect

> library(limma)
> keep = grepl("nt", row.names(metadata))
> d = model.matrix(~0 + donor + treat, metadata[keep, ])
> y = normalizeBetweenArrays(log2(counts[, keep]), method = "quantile")
> 
> v = voomaByGroup(y, group = metadata$type[keep], d, plot = TRUE)

> f = lmFit(v, d)
> fb = eBayes(f)
> 
> 
> res = topTable(fb, coef = "treatplus", sort.by = "p", number = Inf)
> 
> knitr::kable(head(res, 15))
logFC AveExpr t P.Value adj.P.Val B
CXCL8 2.581916 19.80694 9.767088 0.0000092 0.0344386 3.6667038
SLPI 1.890419 22.79432 9.091814 0.0000157 0.0344386 3.4500397
S100A12 3.169849 23.15430 9.039778 0.0000164 0.0344386 3.3942174
C14orf93 3.086437 18.35416 8.596161 0.0000238 0.0344386 2.7013356
OSR2 3.342374 15.58042 8.542010 0.0000250 0.0344386 1.8329605
CSF3 4.465571 18.46044 8.437917 0.0000273 0.0344386 2.5017388
PCDHB2 1.893239 20.03219 7.839069 0.0000468 0.0492400 2.4330637
KLHL35 3.440221 15.72689 7.673301 0.0000547 0.0492400 2.0309624
CD177 2.529608 19.80707 7.587720 0.0000593 0.0492400 2.1975587
S100A8 3.021722 26.91976 7.488745 0.0000651 0.0492400 2.1333866
S100A9 2.390447 26.73337 7.249860 0.0000821 0.0564570 1.9374803
NCAM1 1.847449 18.76559 7.071941 0.0000980 0.0590235 1.7140009
DEFA3 4.510520 15.14801 6.986526 0.0001068 0.0590235 0.6898536
TOPAZ1 2.491827 17.03690 6.963874 0.0001093 0.0590235 1.3456436
DYSF 1.804080 24.39469 6.709349 0.0001420 0.0715635 1.5057166
> volcano_density_plot(res[, c("logFC", "P.Value")], shade.colour = "orange")

> suppressWarnings(DEGreport::degPlot(dds_group, res, n = 9, xs = "treat", group = "type", 
+     batch = "donor"))

> ntplus_effect = res

sh16/sh17 treatment effect

> library(limma)
> keep = !grepl("nt", row.names(metadata))
> d = model.matrix(~0 + donor + treat + type, metadata[keep, ])
> y = normalizeBetweenArrays(log2(counts[, keep]), method = "quantile")
> 
> v = voomaByGroup(y, group = metadata$type[keep], d, plot = TRUE)

> f = lmFit(v, d)
> fb = eBayes(f)
> 
> res = topTable(fb, coef = "treatplus", sort.by = "p", number = Inf)
> 
> knitr::kable(head(res, 15))
logFC AveExpr t P.Value adj.P.Val B
RETN 1.5377891 20.66102 23.35461 1.00e-07 0.0006611 7.588722
S100A9 2.4332080 26.43037 21.01202 2.00e-07 0.0006758 6.936341
CYBA 1.0056334 20.53080 17.32453 7.00e-07 0.0016527 6.296766
FCAR 1.3789761 21.01399 16.57269 9.00e-07 0.0016692 6.150770
CYBB 1.2631403 21.18817 15.49117 1.40e-06 0.0020974 5.834846
CD177 2.6267508 19.20314 14.66743 2.00e-06 0.0025165 5.263359
SULT1B1 1.0076530 19.38423 12.61322 5.40e-06 0.0044148 4.563903
ACSL1 1.1690543 23.65905 12.60906 5.40e-06 0.0044148 4.768172
CES1 0.5766228 20.75964 12.49156 5.80e-06 0.0044148 4.637054
ITGAM 0.7883593 22.90557 12.17975 6.80e-06 0.0044148 4.566317
NAMPT 0.7974357 24.81606 12.17556 6.90e-06 0.0044148 4.524472
MCEMP1 1.4549909 19.34002 12.13479 7.00e-06 0.0044148 4.348670
HLA-DQA1 -0.9592994 19.89827 -11.19283 1.19e-05 0.0066671 3.953712
HIP1 0.6582028 23.38038 11.13031 1.23e-05 0.0066671 4.017406
ACPP 0.6758494 19.61586 10.86617 1.44e-05 0.0069528 3.770140
> volcano_density_plot(res[, c("logFC", "P.Value")], shade.colour = "orange")

> suppressWarnings(DEGreport::degPlot(dds_group, res, n = 9, xs = "treat", group = "type", 
+     batch = "donor"))

> shplus_effect = res

Proteins induced by SET in WT but not in KD

> sh_dependent = inner_join(ntplus_effect %>% tibble::rownames_to_column("id") %>% 
+     filter(adj.P.Val < 0.1), shplus_effect %>% tibble::rownames_to_column("id") %>% 
+     filter(adj.P.Val > 0.2), by = "id")
> 
> suppressWarnings(DEGreport::degPlot(dds_group, ntplus_effect[sh_dependent$id, 
+     ], n = 8, xs = "treat", group = "type", batch = "donor"))

> # suppressWarnings(limmaPlot(limma_ma$E, sh_dependent$id, metadata, xs =
> # 'treat', group = 'type', batch = 'donor' ))

Normalize with RUVseq

It doesn’t help a lot this time. I guess too few replicates to remove noise from any of them.

> library(RUVSeq)
> ruv = RUVs(counts(dds_group), cIdx = row.names(counts), k = 1, scIdx = matrix(c(1:6, 
+     7:12), nrow = 6))
> mds(ruv$normalizedCounts, k = 2, metadata$group) + ggtitle("1 factor")

> ruv = RUVs(as.matrix(counts), cIdx = row.names(counts), k = 2, scIdx = matrix(c(1:6, 
+     7:12), nrow = 6))
> mds(ruv$normalizedCounts, k = 2, metadata$group) + ggtitle("2 factors")

> ruv = RUVs(as.matrix(counts), cIdx = row.names(counts), k = 3, scIdx = matrix(c(1:6, 
+     7:12), nrow = 6))
> mds(ruv$normalizedCounts, k = 2, metadata$group) + ggtitle("3 factors")